In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Render every figure in this notebook with the white plotly theme.
pio.templates.default = "plotly_white"

# NOTE: machine-specific absolute path; point it at your local copy of the CSV.
DATA_PATH = "C:/Users/hussien/OneDrive/Desktop/netflix_content_2023.csv"
netflix_data = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
netflix_data.head()
Out[2]:
Title Available Globally? Release Date Hours Viewed Language Indicator Content Type
0 The Night Agent: Season 1 Yes 2023-03-23 81,21,00,000 English Show
1 Ginny & Georgia: Season 2 Yes 2023-01-05 66,51,00,000 English Show
2 The Glory: Season 1 // 더 글로리: 시즌 1 Yes 2022-12-30 62,28,00,000 Korean Show
3 Wednesday: Season 1 Yes 2022-11-23 50,77,00,000 English Show
4 Queen Charlotte: A Bridgerton Story Yes 2023-05-04 50,30,00,000 English Movie
In [3]:
netflix_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24812 entries, 0 to 24811
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                24812 non-null  object
 1   Available Globally?  24812 non-null  object
 2   Release Date         8166 non-null   object
 3   Hours Viewed         24812 non-null  object
 4   Language Indicator   24812 non-null  object
 5   Content Type         24812 non-null  object
dtypes: object(6)
memory usage: 1.1+ MB

Data cleaning and preprocessing¶

In [4]:
def clean_strings(series):
    """
    Convert comma-grouped numeric strings (e.g. '81,21,00,000') to floats.

    Args:
        series (pd.Series): Values to convert. Strings may use commas as
            digit-group separators; an already-numeric Series is accepted too.

    Returns:
        pd.Series: Float Series with the same index as the input.

    Raises:
        ValueError: If a value is not a valid number once commas are removed.
    """
    # A numeric Series has no `.str` accessor. Passing it straight through
    # keeps the calling cell safe to re-run after the column was converted.
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)

    # The comma is a literal character, so plain substring replacement is
    # enough; no regular expression is involved.
    return series.str.replace(',', '', regex=False).astype(float)
In [5]:
# Replace the comma-grouped text in 'Hours Viewed' with numeric values.
hours_viewed_numeric = clean_strings(netflix_data['Hours Viewed'])
netflix_data['Hours Viewed'] = hours_viewed_numeric
In [6]:
# aggregate one column per group and return the result as a DataFrame
def aggregate_viewership_hours(dataframe, group_column, agg_column, aggregate_function):
    """
    Aggregate one column of a DataFrame within the groups of another column.

    Args:
        dataframe (pd.DataFrame): The input DataFrame.
        group_column (str): Column whose values define the groups.
        agg_column (str): Column to perform the aggregation on.
        aggregate_function (str): Aggregation to apply; one of
            'count', 'sum', 'mean' or 'median'.

    Returns:
        pd.DataFrame: One row per group, with `group_column` and the
        aggregated `agg_column` as regular columns.

    Raises:
        ValueError: If `aggregate_function` is not one of the supported names.
    """
    supported_functions = ('count', 'sum', 'mean', 'median')
    # Reject unsupported names before touching the data.
    if aggregate_function not in supported_functions:
        raise ValueError("Invalid aggregation function.")

    # Passing the name to `.agg` runs the matching groupby reduction.
    aggregated_data = dataframe.groupby(group_column)[agg_column].agg(aggregate_function)

    # Turn the group labels back into an ordinary column.
    return aggregated_data.reset_index()
In [7]:
# Total hours viewed for each content type.
content_type_viewership = aggregate_viewership_hours(
    dataframe=netflix_data,
    group_column='Content Type',
    agg_column='Hours Viewed',
    aggregate_function='sum',
)
In [8]:
# Visualize each content type's share of the total viewership hours.
fig = px.pie(
    data_frame=content_type_viewership,
    names='Content Type',
    values='Hours Viewed',
    title='Total Viewership Hours by Content Type',
)
fig.show()
In [9]:
# aggregate viewership hours by language
language_viewership = aggregate_viewership_hours(netflix_data, 'Language Indicator', 'Hours Viewed', 'sum')

# Largest totals first so the bars are drawn in descending order.
language_viewership = language_viewership.sort_values(by='Hours Viewed', ascending=False)

# aggregate_viewership_hours returns a DataFrame whose index is only row
# numbers, so the bars are built from the named columns: languages on the
# x axis and their summed hours on the y axis.
fig = go.Figure(data=[
    go.Bar(
        x=language_viewership['Language Indicator'],
        y=language_viewership['Hours Viewed'],
        marker_color='lightcoral'
    )
])

fig.update_layout(
    title='Total Viewership Hours by Language (2023)',
    xaxis_title='Language',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis_tickangle=45,
    height=600,
    width=1000
)

fig.show()
In [10]:
# Parse 'Release Date' as datetimes and derive the calendar month of release.
netflix_data['Release Date'] = pd.to_datetime(netflix_data['Release Date'])
netflix_data['Release Month'] = netflix_data['Release Date'].dt.month

# Sum the hours viewed over all titles released in the same month.
monthly_viewership = netflix_data.groupby('Release Month')['Hours Viewed'].sum()

# Line chart of the monthly totals.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=monthly_viewership.index,
        y=monthly_viewership.values,
        mode='lines+markers',
        marker_color='blue',
        line_color='blue',
    )
)

fig.update_layout(
    title='Total Viewership Hours by Release Month (2023)',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis=dict(
        title='Month',
        # Show month abbreviations instead of the numbers 1-12.
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ),
    width=1000,
    height=600,
)

fig.show()
In [11]:
# The five titles with the most hours viewed, largest first.
top_5_titles = netflix_data.nlargest(n=5, columns='Hours Viewed')

# Display only the descriptive columns for those titles.
top_5_titles.loc[:, ['Title', 'Hours Viewed', 'Language Indicator', 'Content Type', 'Release Date']]
Out[11]:
Title Hours Viewed Language Indicator Content Type Release Date
0 The Night Agent: Season 1 812100000.0 English Show 2023-03-23
1 Ginny & Georgia: Season 2 665100000.0 English Show 2023-01-05
18227 King the Land: Limited Series // 킹더랜드: 리미티드 시리즈 630200000.0 Korean Movie 2023-06-17
2 The Glory: Season 1 // 더 글로리: 시즌 1 622800000.0 Korean Show 2022-12-30
18214 ONE PIECE: Season 1 541900000.0 English Show 2023-08-31
In [12]:
# Total hours viewed for every (release month, content type) pair:
# months become the rows and content types become the columns.
monthly_viewership_by_type = pd.pivot_table(
    netflix_data,
    values='Hours Viewed',
    index='Release Month',
    columns='Content Type',
    aggfunc='sum',
)
monthly_viewership_by_type
Out[12]:
Content Type Movie Show
Release Month
1.0 2.275900e+09 4.995700e+09
2.0 1.654400e+09 5.449300e+09
3.0 2.109400e+09 5.327700e+09
4.0 2.757600e+09 4.108100e+09
5.0 2.520500e+09 4.574100e+09
6.0 3.135800e+09 5.386200e+09
7.0 1.615700e+09 4.909100e+09
8.0 2.186400e+09 4.631400e+09
9.0 2.092300e+09 5.169900e+09
10.0 3.400400e+09 4.722800e+09
11.0 1.866900e+09 5.882600e+09
12.0 2.554900e+09 7.500900e+09
In [13]:
# Visualize the pivot table: one line per content type across release months.
fig = go.Figure()

for content_type, hours_by_month in monthly_viewership_by_type.items():
    trace = go.Scatter(
        x=hours_by_month.index,
        y=hours_by_month,
        mode='lines+markers',
        name=content_type,
    )
    fig.add_trace(trace)

fig.update_layout(
    title='Viewership Trends by Content Type and Release Month (2023)',
    yaxis_title='Total Hours Viewed (in billions)',
    legend_title='Content Type',
    xaxis=dict(
        title='Month',
        # Show month abbreviations instead of the numbers 1-12.
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ),
    width=1000,
    height=600,
)

fig.show()
In [14]:
# define seasons based on release months
def get_season(month):
    """
    Map a calendar month number to the name of its season.

    Args:
        month: Month number from 1 to 12. Integer-valued floats such as 3.0
            are matched as well.

    Returns:
        str or None: 'Winter' for 12, 1, 2; 'Spring' for 3-5; 'Summer' for
        6-8; 'Fall' for 9-11; None for any other value, including NaN.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    # Anything else (e.g. NaN for a missing release date) has no season; it
    # must not be silently counted as 'Fall'.
    return None

# Label every title with the season of its release month.
netflix_data['Release Season'] = netflix_data['Release Month'].apply(get_season)

# Sum the hours viewed per season, then arrange the totals in calendar order.
seasons_order = ['Winter', 'Spring', 'Summer', 'Fall']
seasonal_viewership = (
    netflix_data.groupby('Release Season')['Hours Viewed']
    .sum()
    .reindex(seasons_order)
)

# Bar chart of the seasonal totals.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=seasonal_viewership.index,
        y=seasonal_viewership.values,
        marker_color='orange',
    )
)

fig.update_layout(
    title='Total Viewership Hours by Release Season (2023)',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis=dict(
        title='Season',
        tickangle=0,
        # Keep the bars in calendar order rather than plotly's default order.
        categoryorder='array',
        categoryarray=seasons_order,
    ),
    width=800,
    height=500,
)

fig.show()
In [15]:
# Number of titles released in each month, ordered by month number.
monthly_releases = netflix_data['Release Month'].value_counts().sort_index()

# Total hours viewed for the titles released in each month.
monthly_viewership = netflix_data.groupby('Release Month')['Hours Viewed'].sum()

# Bars: release counts, measured on the left-hand axis.
releases_bar = go.Bar(
    x=monthly_releases.index,
    y=monthly_releases.values,
    name='Number of Releases',
    marker_color='goldenrod',
    opacity=0.7,
    yaxis='y1',
)

# Line: viewership hours, measured on the right-hand axis.
viewership_line = go.Scatter(
    x=monthly_viewership.index,
    y=monthly_viewership.values,
    name='Viewership Hours',
    mode='lines+markers',
    marker=dict(color='red'),
    line=dict(color='red'),
    yaxis='y2',
)

fig = go.Figure()
fig.add_trace(releases_bar)
fig.add_trace(viewership_line)

fig.update_layout(
    title='Monthly Release Patterns and Viewership Hours (2023)',
    xaxis=dict(
        title='Month',
        # Show month abbreviations instead of the numbers 1-12.
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ),
    yaxis=dict(
        title='Number of Releases',
        side='left',
        showgrid=False,
    ),
    # Second y axis drawn on top of the first one, labelled on the right.
    yaxis2=dict(
        title='Total Hours Viewed (in billions)',
        side='right',
        overlaying='y',
        showgrid=False,
    ),
    # Place the legend just outside the plotting area.
    legend=dict(
        orientation='v',
        xanchor='left',
        x=1.05,
        y=1,
    ),
    width=1000,
    height=600,
)

fig.show()
In [16]:
# significant holidays and events in 2023, as datetimes
important_dates = pd.to_datetime([
    '2023-01-01',  # New Year's Day
    '2023-02-14',  # Valentine's Day
    '2023-07-04',  # Independence Day (US)
    '2023-10-31',  # Halloween
    '2023-12-25',  # Christmas Day
])

# Flag releases dated within 3 days before or after any of these holidays.
# The day offsets are computed for the whole column at once instead of calling
# a Python function for every row. Rows without a release date give NaN
# offsets, which `between` treats as "not in range".
release_dates = netflix_data['Release Date']
near_holiday = pd.concat(
    [(release_dates - holiday).dt.days.between(-3, 3) for holiday in important_dates],
    axis=1,
).any(axis=1)
holiday_releases = netflix_data[near_holiday]

# aggregate viewership hours for releases near significant holidays
holiday_viewership = holiday_releases.groupby('Release Date')['Hours Viewed'].sum()

holiday_releases[['Title', 'Release Date', 'Hours Viewed']]
Out[16]:
Title Release Date Hours Viewed
2 The Glory: Season 1 // 더 글로리: 시즌 1 2022-12-30 622800000.0
6 La Reina del Sur: Season 3 2022-12-30 429600000.0
11 Kaleidoscope: Limited Series 2023-01-01 252500000.0
29 Perfect Match: Season 1 2023-02-14 176800000.0
124 Lady Voyeur: Limited Series // Olhar Indiscret... 2022-12-31 86000000.0
... ... ... ...
22324 The Romantics: Limited Series 2023-02-14 1000000.0
22327 Aggretsuko: Season 5 // アグレッシブ烈子: シーズン5 2023-02-16 900000.0
22966 The Lying Life of Adults: Limited Series // La... 2023-01-04 900000.0
22985 Community Squad: Season 1 // División Palermo:... 2023-02-17 800000.0
24187 Live to Lead: Limited Series 2022-12-31 400000.0

98 rows × 3 columns